Download Data

This notebook downloads the necessary data to replicate the results of our paper on Gender Inequalities on Wikipedia.

Note that we use a file named dbpedia_config.py, where we set which language editions we will study, as well as where to save and load data files.

By Eduardo Graells-Garrido.


In [1]:
!cat dbpedia_config.py


# The DBpedia editions we will consider.
# MAIN_LANGUAGE is the single edition used for the long-abstract and page-link
# downloads; LANGUAGES is the list of edition codes obtained by splitting on '|'.
MAIN_LANGUAGE = 'en'
LANGUAGES = 'en|bg|ca|cs|de|es|eu|fr|hu|id|it|ja|ko|nl|pl|pt|ru|tr|ar|el'.split('|')

# Folder where the downloaded data files are stored (used as the wget output directory).
# The commented-out line is an alternative location kept for reference.
#DATA_FOLDER = '/home/egraells/resources/dbpedia'
DATA_FOLDER = '/media/egraells/113A88F901102CA6/data/dbpedia_2015'

# Folder to store analysis results
TARGET_FOLDER = '/home/egraells/phd/notebooks/pajaritos/person_results'

# These settings are used when crawling WikiData.
# NOTE(review): presumably the flag enables the gender queries and the e-mail
# identifies the crawler to the service -- neither is used in this notebook; confirm.
QUERY_WIKIDATA_GENDER = False
YOUR_EMAIL = 'mail@example.com'

In [2]:
import subprocess
import os
import dbpedia_config

In [3]:
# Local shortcuts: the folder that receives the downloads and the list of
# language editions to fetch, both taken from dbpedia_config.
target, languages = dbpedia_config.DATA_FOLDER, dbpedia_config.LANGUAGES

In [4]:
# Ontology
# note that previously (2014 version and earlier) this was in bzip format.
# The file is only fetched when it is not already present in the data folder.
ontology_path = '{0}/dbpedia.owl'.format(target)

if not os.path.exists(ontology_path):
    # stdout/stderr are left as None so wget writes to this process' own streams.
    status = subprocess.call(['/usr/bin/wget',
        'http://downloads.dbpedia.org/2015-10/dbpedia_2015-10.owl',
        '-O', ontology_path],
        stdout=None, stderr=None)

    if status != 0:
        # wget -O creates/truncates the output file before transferring data, so a
        # failed download leaves an empty or partial file behind. Remove it, otherwise
        # the exists() check above would treat it as a finished download next time.
        if os.path.exists(ontology_path):
            os.remove(ontology_path)
        print('wget exited with status {0}; removed incomplete file {1}'.format(status, ontology_path))

In [5]:
# Base URL of the DBpedia 2015-10 core-i18n dump; per-language files are fetched
# from <db_uri>/<lang>/.
# NOTE(review): this comment previously pointed to http://wiki.dbpedia.org/Downloads2015-04,
# which does not match the 2015-10 release in the URL below -- confirm the right release page.
db_uri = 'http://downloads.dbpedia.org/2015-10/core-i18n'

In [16]:
# Datasets fetched for every language edition, in download order. Each one is
# published as <db_uri>/<lang>/<dataset>_<lang>.ttl.bz2 and stored under the same
# file name inside the data folder.
datasets = [
    'instance_types',
    'interlanguage_links',
    'labels',
    'mappingbased_literals',
    'mappingbased_objects',
]

for lang in languages:
    for dataset in datasets:
        filename = '{0}_{1}.ttl.bz2'.format(dataset, lang)
        local_path = '{0}/{1}'.format(target, filename)

        # Files already present are not downloaded again.
        if os.path.exists(local_path):
            continue

        status = subprocess.call(['/usr/bin/wget',
            '{0}/{1}/{2}'.format(db_uri, lang, filename),
            '-O', local_path],
            stdout=None, stderr=None)

        if status != 0:
            # wget -O creates/truncates the output file before transferring data, so a
            # failed download leaves an empty or partial file behind. Remove it so the
            # exists() check retries this file on the next run, and keep going with
            # the remaining datasets instead of aborting the whole loop.
            if os.path.exists(local_path):
                os.remove(local_path)
            print('wget exited with status {0}; removed incomplete file {1}'.format(status, local_path))

In [7]:
# http://oldwiki.dbpedia.org/Datasets/NLP#h172-7
# Gender dataset for the English DBpedia, fetched only when not already present.
dbpedia_gender = 'http://wifo5-04.informatik.uni-mannheim.de/downloads/datasets/genders_en.nt.bz2'
dbpedia_gender_path = '{0}/genders_en.nt.bz2'.format(target)

if not os.path.exists(dbpedia_gender_path):
    status = subprocess.call(['/usr/bin/wget',
        dbpedia_gender,
        '-O', dbpedia_gender_path],
        stdout=None, stderr=None)

    if status != 0:
        # wget -O creates/truncates the output file before transferring data, so a
        # failed download leaves an empty or partial file behind. Remove it, otherwise
        # the exists() check above would treat it as a finished download next time.
        if os.path.exists(dbpedia_gender_path):
            os.remove(dbpedia_gender_path)
        print('wget exited with status {0}; removed incomplete file {1}'.format(status, dbpedia_gender_path))

In [8]:
# http://www.davidbamman.com/?p=12
# NOTE(review): an earlier note here said "in previous versions, this was a text file.
# now it's a bzipped file with n-triplets". That does not match the .txt URL below and
# may have been meant for the DBpedia genders_en.nt.bz2 file -- confirm.
wikipedia_gender = 'http://www.ark.cs.cmu.edu/bio/data/wiki.genders.txt'
wikipedia_gender_path = '{0}/wiki.genders.txt'.format(target)

if not os.path.exists(wikipedia_gender_path):
    # Fetch the Wikipedia gender file itself. This call used to pass the
    # dbpedia_gender URL, which stored the wrong dataset under wiki.genders.txt.
    status = subprocess.call(['/usr/bin/wget',
        wikipedia_gender,
        '-O', wikipedia_gender_path],
        stdout=None, stderr=None)

    if status != 0:
        # wget -O creates/truncates the output file before transferring data, so a
        # failed download leaves an empty or partial file behind. Remove it, otherwise
        # the exists() check above would treat it as a finished download next time.
        if os.path.exists(wikipedia_gender_path):
            os.remove(wikipedia_gender_path)
        print('wget exited with status {0}; removed incomplete file {1}'.format(status, wikipedia_gender_path))

In [11]:
# Long abstracts, main language only.
# The existence check must use the same file name the download is saved under
# (.ttl.bz2). It used to look for a .nt.bz2 file that is never created, so the
# abstracts were downloaded again on every run.
long_abstracts_path = '{0}/long_abstracts_{1}.ttl.bz2'.format(target, dbpedia_config.MAIN_LANGUAGE)

if not os.path.exists(long_abstracts_path):
    status = subprocess.call(['/usr/bin/wget',
        '{1}/{0}/long_abstracts_{0}.ttl.bz2'.format(dbpedia_config.MAIN_LANGUAGE, db_uri),
        '-O', long_abstracts_path],
        stdout=None, stderr=None)

    if status != 0:
        # wget -O creates/truncates the output file before transferring data, so a
        # failed download leaves an empty or partial file behind. Remove it, otherwise
        # the exists() check above would treat it as a finished download next time.
        if os.path.exists(long_abstracts_path):
            os.remove(long_abstracts_path)
        print('wget exited with status {0}; removed incomplete file {1}'.format(status, long_abstracts_path))

In [12]:
# network data for english only
# The remote file is requested with the .ttl.bz2 extension, matching both the local
# file name and every other dataset fetched from db_uri in this notebook. The URL
# used to end in .nt.bz2.
# NOTE(review): confirm against the 2015-10 download listing that page_links is
# published as .ttl.bz2.
page_links_path = '{0}/page_links_{1}.ttl.bz2'.format(target, dbpedia_config.MAIN_LANGUAGE)

if not os.path.exists(page_links_path):
    status = subprocess.call(['/usr/bin/wget',
        '{1}/{0}/page_links_{0}.ttl.bz2'.format(dbpedia_config.MAIN_LANGUAGE, db_uri),
        '-O', page_links_path],
        stdout=None, stderr=None)

    if status != 0:
        # wget -O creates/truncates the output file before transferring data, so a
        # failed download leaves an empty or partial file behind. Remove it, otherwise
        # the exists() check above would treat it as a finished download next time.
        if os.path.exists(page_links_path):
            os.remove(page_links_path)
        print('wget exited with status {0}; removed incomplete file {1}'.format(status, page_links_path))

In [ ]: